RNA-seq was run on 36 ovarian cancer cell lines, each in singlicate.
All 36 cell lines have 72h cisplatin IC50s determined by Kristin Adams and Kendra Wendt in the Lang Lab.
Add in links to Lab notebooks for IC50, RNAseq sample prep
library(knitr)
knit("src/load_inputs.Rmd")
processing file: src/load_inputs.Rmd
|
| | 0%
|
|..... | 8%
|
|.......... | 17% [load packages]
|
|................ | 25%
|
|..................... | 33% [unnamed-chunk-1]
|
|.......................... | 42%
|
|................................ | 50% [load metatable-full]
|
|..................................... | 58%
|
|.......................................... | 67% [load metatable]
|
|............................................... | 75%
|
|.................................................... | 83% [read count matrix]
|
|.......................................................... | 92%
|
|...............................................................| 100% [unnamed-chunk-2]
output file: load_inputs.md
[1] "load_inputs.md"
Subset Metadata to exclude PEO4, PEO6, and PEA1
metadata.subset = metadata[!(metadata$CellLine %in% c("PEO4", "PEO6", "PEA1")),]
countmatrix2.subset = countmatrix2[, colnames(countmatrix2) %in% metadata.subset$files]
TPM2.subset = TPM2[, colnames(TPM2) %in% metadata.subset$CellLine]
TPM.log.subset = TPM.log[, colnames(TPM.log) %in% metadata.subset$files]
print(countmatrix2.subset)
X14 X22 X21 X31 X30 X17 X32 X13 X5
A1BG 52 45 48 2 4 27 16 37 30
A1BG-AS1 108 80 88 15 6 107 34 80 28
A1CF 0 0 0 0 0 0 1 0 0
A2M 1161 0 10 7 23 11639 1 11 38
A2M-AS1 15 11 99 52 8 4 10 5 4
A2ML1 7 47 2 0 9 20 23 2 0
A2MP1 0 0 0 0 0 0 0 0 0
A3GALT2 0 0 0 0 0 0 0 0 0
A4GALT 369 39 284 153 57 154 26 143 37
A4GNT 0 0 0 0 0 0 0 0 0
AA06 0 0 0 0 0 0 0 0 0
AAAS 889 1392 1587 677 1397 1023 1416 1153 1454
AACS 2836 1984 713 1344 1113 1350 1340 686 639
AACSP1 2 0 1 4 0 0 1 1 0
AADAC 2 2 6 0 3 4 18 0 73
AADACL2 0 0 0 0 1 0 5 0 0
AADACL3 0 0 0 0 0 0 0 0 0
AADACL4 0 0 0 0 0 6 0 0 2
AADAT 235 296 633 118 223 74 436 187 225
AAED1 348 321 891 554 309 441 233 658 215
AAGAB 4120 3392 3090 1630 3544 2878 4825 3824 1508
AAK1 3673 2172 2322 2244 2444 4453 2252 2593 3770
AAMDC 215 216 491 237 258 511 2129 130 146
AAMP 2497 3582 2349 1826 1898 3290 4554 4058 1826
AANAT 2 0 0 0 1 1 5 1 2
AAR2 565 2711 1263 769 1468 1357 1720 937 1482
AARD 351 519 322 61 70 39 452 121 98
AARS 4174 8162 7419 7563 7822 5340 2623 7486 5111
AARS2 733 817 564 379 875 1129 583 851 895
AARSD1 159 855 482 423 809 347 396 411 343
AASDH 644 477 689 688 636 624 665 236 315
AASDHPPT 1261 1917 2510 2203 2379 4342 1308 2985 2821
AASS 586 174 637 206 42 4 120 19 754
AATF 1382 5882 2831 2341 3856 1354 2917 3659 4013
AATK 95 19 17 6 36 25 65 14 5
AATK-AS1 0 0 0 0 0 0 0 0 0
ABAT 107 743 126 232 92 1110 195 8 633
ABCA1 40 287 1755 190 413 1037 345 550 1828
ABCA10 3 5 7 3 0 8 2 1 6
ABCA11P 191 549 209 110 193 347 287 147 328
ABCA12 42 8 27 41 0 47 21 8 2
ABCA13 30 75 67 142 2 0 2 0 0
ABCA17P 0 3 1 0 1 3 5 0 23
ABCA2 622 781 801 457 554 1231 732 690 1230
ABCA3 546 156 1039 252 237 267 271 136 245
ABCA4 9 142 21 57 156 417 94 46 173
ABCA5 650 164 233 356 135 205 152 87 268
ABCA6 0 1 1 0 0 4 0 0 0
ABCA7 114 125 121 278 90 165 100 92 125
ABCA8 3 6 0 1 0 6 3 0 1
ABCA9 0 1 8 2 0 76 1 0 2
ABCB1 170 54 41 116 29 0 1 3 8
ABCB10 1225 629 732 832 1646 928 394 683 1609
ABCB11 0 0 0 0 0 0 10 0 0
ABCB4 19 70 12 9 0 0 12 0 0
ABCB5 1 56 33 0 33 223 103 9 78
ABCB6 620 663 133 241 208 495 676 947 147
ABCB7 1583 975 1422 1400 2329 1379 1573 1179 1414
ABCB8 515 574 500 205 341 857 260 343 559
ABCB9 202 157 74 40 72 151 144 104 49
ABCC1 1279 4911 4665 2207 2293 5168 3316 7774 2832
ABCC10 293 242 199 134 180 492 283 332 206
ABCC11 2 3 11 4 5 7 9 0 2
ABCC12 0 0 0 2 0 0 0 1 0
ABCC13 1 0 0 2 0 0 13 0 0
ABCC2 10 23 24 47 45 29 7 6 14
ABCC3 2445 255 1201 488 459 423 89 35 81
ABCC4 455 1054 1383 1645 807 1645 881 2064 6141
ABCC5 1828 1201 592 724 612 1836 497 1073 1360
ABCC5-AS1 0 0 0 1 0 0 0 0 0
ABCC6 50 91 30 4 27 72 20 76 7
ABCC6P1 24 18 1 5 4 43 9 0 0
ABCC6P2 36 34 17 0 3 12 14 19 6
ABCC8 16 0 1 0 0 2 7 0 0
ABCC9 1 0 3 6 0 1 1 0 0
ABCD1 379 104 452 221 229 363 240 192 389
ABCD2 4 0 0 0 0 0 0 0 1
ABCD3 4013 2019 1834 2817 4969 5719 2284 4875 5167
ABCD4 352 352 371 309 133 517 1152 480 203
ABCE1 5269 6565 6960 4720 12020 5439 7210 10354 5768
ABCF1 7913 4562 4849 2529 4211 6006 6770 4739 3134
ABCF2 3929 4840 6686 2916 4970 5439 2695 5436 4722
ABCF3 2458 1556 1146 1123 1317 1298 1457 1612 1458
ABCG1 157 86 63 5 46 251 43 100 535
ABCG2 263 43 223 357 47 11 12 23 26
ABCG4 13 16 9 2 18 15 17 3 25
ABCG5 3 5 0 0 1 0 0 0 0
ABCG8 0 1 0 1 0 0 3 0 0
ABHD1 22 7 9 4 5 0 15 7 4
ABHD10 1684 1800 932 707 1198 1374 1304 1582 838
X24 X33
A1BG 4 1
A1BG-AS1 20 9
A1CF 0 0
A2M 0 113
A2M-AS1 21 26
A2ML1 0 21
A2MP1 0 0
A3GALT2 0 0
A4GALT 349 82
A4GNT 0 0
AA06 0 0
AAAS 665 1172
AACS 1051 1081
AACSP1 0 2
AADAC 148 3
AADACL2 0 0
AADACL3 0 0
AADACL4 0 0
AADAT 30 211
AAED1 361 303
AAGAB 3164 2449
AAK1 1415 2508
AAMDC 73 267
AAMP 2175 1920
AANAT 0 0
AAR2 695 1066
AARD 3 298
AARS 5132 6165
AARS2 355 279
AARSD1 570 512
AASDH 692 1034
AASDHPPT 1652 2215
AASS 269 231
AATF 4076 3884
AATK 0 4
AATK-AS1 0 0
ABAT 7 381
ABCA1 204 452
ABCA10 240 5
ABCA11P 113 264
ABCA12 84 66
ABCA13 17 20
ABCA17P 0 5
ABCA2 554 410
ABCA3 116 1319
ABCA4 12 4652
ABCA5 219 219
ABCA6 4 0
ABCA7 231 53
ABCA8 0 0
ABCA9 3 0
ABCB1 0 11
ABCB10 937 891
ABCB11 1 0
ABCB4 3 21
ABCB5 0 0
ABCB6 69 32
ABCB7 811 1159
ABCB8 73 882
ABCB9 42 70
ABCC1 2692 4727
ABCC10 62 145
ABCC11 1 3
ABCC12 0 0
ABCC13 0 1
ABCC2 12 73
ABCC3 2115 1089
ABCC4 1223 263
ABCC5 876 1375
ABCC5-AS1 0 0
ABCC6 11 53
ABCC6P1 2 14
ABCC6P2 17 28
ABCC8 0 0
ABCC9 0 49
ABCD1 92 225
ABCD2 0 1
ABCD3 2098 3728
ABCD4 239 415
ABCE1 5612 4200
ABCF1 3112 1545
ABCF2 1331 5274
ABCF3 1079 1323
ABCG1 7 57
ABCG2 297 39
ABCG4 0 31
ABCG5 0 0
ABCG8 0 1
ABHD1 7 2
ABHD10 940 1313
[ reached getOption("max.print") -- omitted 29654 rows ]
dds.subset <- DESeqDataSetFromMatrix(countData = countmatrix2.subset,
colData = metadata.subset,
design = ~ PlatinumSensitivity)
converting counts to integer mode
Don’t need to filter - this is done automatically
path = paste0(deseq_Rdata_folder,
"/dds_HGSC_subset.RData")
# Run DESeq
# dds.subset <- DESeq(dds.subset)
# save(dds.subset, file = path)
# Alternatively, if this has already been run, can load dds object
load(path)
res.subset <- results(dds.subset, contrast=c("PlatinumSensitivity", "resistant", "sensitive"))
res.subset
log2 fold change (MLE): PlatinumSensitivity resistant vs sensitive
Wald test p-value: PlatinumSensitivity resistant vs sensitive
DataFrame with 29744 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 2.35873e+01 0.00919303 1.058153 0.00868781 0.99306822 0.999033
A1BG-AS1 5.05892e+01 0.17653501 0.845348 0.20883117 0.83458004 0.992631
A1CF 8.00813e-02 -0.39874829 3.357670 -0.11875743 0.90546754 NA
A2M 1.06465e+03 -5.64415880 1.526628 -3.69713990 NA NA
A2M-AS1 2.35457e+01 2.13984872 0.758420 2.82145497 0.00478063 0.325398
... ... ... ... ... ... ...
ZYG11A 587.519 -0.252313 0.488149 -0.516877 0.6052417 0.976640
ZYG11B 2153.760 -0.700565 0.414343 -1.690787 0.0908775 0.828143
ZYX 5119.927 0.245842 0.373398 0.658392 0.5102863 0.972000
ZZEF1 1760.039 0.609198 0.341440 1.784203 0.0743908 0.792812
ZZZ3 4145.632 0.073347 0.272978 0.268692 0.7881664 0.987312
path = paste0(deseq_output_folder,
"/DESeq_HGSC_subset.csv")
write.csv(as.data.frame(res.subset), file = path)
Filter res.subset for padj < 0.05 and |log2FC| >= 1.2
res.subset.filtered <- as.data.frame(res.subset) %>%
filter(padj<0.05)%>%
filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.subset.filtered
Here we performed normal transformation [log2(n+1)], variance stabilized transformation, and regularized log tranformation to improve visualization of the data values. To speed up subsequent re-runs, we have hidden analysis for non-vst.
# ntd <- normTransform(dds.subset)
# meanSdPlot(assay(ntd))
vsd.subset <- vst(dds.subset)
meanSdPlot(assay(vsd.subset))
vsd.subset.df = as.data.frame(assay(vsd.subset))
# rld <- rlog(dds.subset)
# meanSdPlot(assay(rld))
Based on this data, variance-stabilized transformation lead to the lowest standard deviation between samples and was mostly located towards the high expression transcripts, as might be expected.
pcaData.subset <- plotPCA(vsd.subset, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData.subset, "percentVar"))
myColors <- c("#76AB7E", "#63E678", "#1D32FB", "#7B87FD", "#01BD1F", "#E8A426", "#0B7C1D", "#BB19E7")
names(myColors) <- levels(vsd.subset$Subtype)
colScale <- scale_colour_manual(name = "Subtype",values = myColors)
pca.subset <- ggplot(pcaData.subset, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
geom_point(size=3) +
geom_text(hjust=0, vjust=0) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
theme_classic() +
colScale
pca.subset
path = paste0(deseq_output_folder,
"/pca_HGSC_subset.pdf")
ggsave(path, pca.subset)
Saving 7.29 x 4.51 in image
PCA plot groups samples roughly by subtype
Plotting the top 100 most highly expressed genes:
select <- order(rowMeans(counts(dds.subset,normalized=TRUE)),
decreasing=TRUE)[1:100]
df <- as.data.frame(colData(dds.subset)[,c("Subtype", "PlatinumSensitivity")])
pheatmap(assay(vsd.subset)[select,], cluster_rows=FALSE, show_rownames=FALSE,
labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df)
Pull genes contributing to principal components 1 & 2
# Not sure what this was for:
# PCA <- prcomp(TPM.log, scale=TRUE)
# PCA.mat <- as.data.frame(PCA$x)
# PCA.PC1filt <- PCA.mat %>% filter(PC1 < quantile(PCA.mat[,"PC1"], .2)[[1]])
TPM3 <- TPM2.subset[rowSums(TPM2.subset)>1000,]
counts.sc <- t(TPM3)
dist <- dist(counts.sc)
clust <- hclust(dist, method="average")
dend1.subset <- as.dendrogram(clust)
par(mar=c(10,2,1,1))
my_colors <- ifelse(metadata.subset$PlatinumSensitivity=="sensitive", "red",
ifelse(metadata.subset$PlatinumSensitivity=="resistant", "blue",
ifelse(metadata.subset$PlatinumSensitivity=="intermediate", "yellow", "white" )))
plot(dend1.subset)
colored_bars(colors = my_colors, dend = dend1.subset, rowLabels = "PlatinumSensitivity")
I tried a lot of different iterations of this (including: various cutoffs for variance of genes, genes contributing most to first and second principal components, more highly expressed genes, TPM vs vst data, clustering methods), but the fundamental problem is that isogenic pairs rarely cluster anywhere near each other, even though the PCA analysis shows this relationship. I don’t know that this is a reliable method for determining relatedness/subtyping, unless a robust gene set is developed.(This is from Jessi)
vsd.subset.df <-as.data.frame(assay(vsd.subset))
pheatmap(vsd.subset.df[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))
pheatmap(TPM.log.subset[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], annotation_col=df, color=colorRampPalette(c("white", "red"))(10))
center_scale <- function(x) {
scale(x, scale=FALSE)
}
vsd.subset.meancenter <- apply(vsd.subset.df, 1, center_scale)
vsd.subset.meancenter <-t(vsd.subset.meancenter)
colnames(vsd.subset.meancenter) <- colnames(vsd.subset.df)
vsd.subset.meancenter <- as.data.frame(vsd.subset.meancenter)
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
extreme_val = max(
abs(min(vsd.subset.meancenter[rownames(res.subset.filtered),])),
abs(max(vsd.subset.meancenter[rownames(res.subset.filtered),]))
)
myBreaks <- c(seq(-1 * extreme_val, 0, length.out=ceiling(50/2) + 1),
seq(extreme_val/50, extreme_val, length.out=floor(50/2)))
plotHeatmap = pheatmap(vsd.subset.meancenter[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks)
print(plotHeatmap)
ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_heatmap.svg"), plot = plotHeatmap, device = "svg")
Saving 8 x 10 in image
# These genes chosen by Jessi for the purpose of figures in paper
selected_labels = c("SULT1A3", "CCL5", "FN1", "CD70", "IL1RN", "ZEB2", "A2M-AS1", "TAP2", "CEBPA", "BARX1", "COL2A1", "CT45A5")
labels = ifelse(rownames(res.subset) %in% selected_labels,
rownames(res.subset),
"")
# test_points = res.subset[rownames(res.subset) %in% selected_labels,]
# print(as.data.frame(test_points))
# labels = rownames(test_points)
plotVolcano = res.subset %>%
ggplot(aes(x=log2FoldChange, y=-log10(padj), label=labels)) +
geom_point(alpha=0.5) +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
geom_text_repel(segment.size = .2, size = 2, max.overlaps = 1000, force = 10, force_pull = 1, nudge_y = 1)
# geom_text_repel(segment.size = .2, size = 2, max.overlaps = 10, force = 1, force_pull = 60, nudge_y = 1)
print(plotVolcano)
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_volcano.svg"), plot = plotVolcano, device = "svg")
Saving 7 x 7 in image
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Perform gene set enrichment analysis using Cluster Profiler. This gives us GO pathways that are significantly regulated based on the log2fold change of expression of individual genes.
Using a padj Cutoff of 0.15.
gene_list.subset <- res.subset$log2FoldChange
names(gene_list.subset) <- rownames(res.subset)
gene_list.subset <- sort(gene_list.subset, decreasing = TRUE)
# Set the seed so our results are reproducible:
set.seed(2024)
gsea_res.subset <- gseGO(gene_list.subset, ont = "BP", OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", seed = TRUE, pvalueCutoff = 0.15)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, :
There are ties in the preranked stats (9.17% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
# Format output
gsea_res_df.subset <- as.data.frame(gsea_res.subset)
gsea_res_df.subset <- gsea_res_df.subset %>%
mutate(original_row_num = row_number())
gsea_res_df.subset <- gsea_res_df.subset[order(gsea_res_df.subset$NES, decreasing = TRUE),]
row.names(gsea_res_df.subset) <- gsea_res_df.subset$ID
NES is the normalized enrichment score.
gsea_res_df_short.subset <- gsea_res_df.subset[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short.subset$"core_enrichment_genes" <- gsea_res_df.subset$core_enrichment
Volcano Plot of gene ontology (Average NES & adjusted p value)
as.data.frame(gsea_res_df_short.subset) %>%
ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.subset))) +
geom_point() +
scale_alpha_manual(0.5) +
theme_minimal() +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Warning: ggrepel: 59 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gsea_res_df_short.up.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES >= 0)
gsea_res_df_short.up.subset = gsea_res_df_short.up.subset[order(gsea_res_df_short.up.subset$p.adjust), ]
path = paste0(deseq_output_folder,
"/subset_cisplatin_resistant_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up.subset, file = path)
gsea_res_df_short.up.subset
Use Revigo to cluster upregulated pathways with pvalue < 0.15
revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,])
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))
if (nrow(revigo_input.cellline.up.subset) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}
Use Revigo to cluster upregulated pathways with pvalue < 0.05
revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,])
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))
if (nrow(revigo_input.cellline.up.subset) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}
gsea_res_df_short.down.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES <= 0)
path = paste0(deseq_output_folder,
"/subset_cisplatin_resistant_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down.subset, file = path)
gsea_res_df_short.down.subset
Use Revigo to cluster downregulated pathways with pvalue < 0.15
revigo_input.cellline.down.subset <- gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.down.subset) <- rownames(gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,])
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.down.subset),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.down.subset$p.adjust), rownames(revigo_input.cellline.down.subset))
if (nrow(revigo_input.cellline.down.subset) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}